# File:     JOP_RR1_financial_topics.R
# Purpose:  This script applies Nicolo's financial topics to the data
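# Input:    ./data/finalData.RData
#           ./output/chatGPT_polite_pairwise_1000.RData
#           ./output/chatGPT_polite_pairwise_2000.RData
# Output:   ./output/figures/polite_simple_comparison.pdf
#           ./output/chatGPT_polite_pairwise_2000.RData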
# Author:   JB

# Remove every object from the global environment before running.
rm(list = ls())
# library() stops immediately when a package is missing; require() would only
# warn, return FALSE, and let the script fail later with a confusing error.
library(tidyverse)
library(ggridges)
# All relative paths below are resolved against this project root.
# setwd('C:/Users/Jimbo/Dropbox/FED/FED/Paper/JOP/RR1_replication/')
setwd('D:/Dropbox/FED/FED/Paper/JOP/RR1_replication')

# Placeholder for the plotting table that is assembled further down.
toplot <- NULL
# Restores `res`, the pairwise ChatGPT annotations (one row per match).
load('./output/chatGPT_polite_pairwise_2000.RData')

# Per-chair mean and sd of the recoded answer (0 = conversation 1 judged more
# polite, 1 = conversation 2 judged more polite).
# The regex strips ChatGPT's boilerplate so only the conversation number is
# left; replies that still cannot be parsed become NA and are dropped from the
# statistics via na.rm (previously na.rm was passed to as.numeric(), where it
# is ignored, so a single unparseable reply turned the mean into NA).
res %>%
  as_tibble() %>%
  mutate(morePolite = gsub('\\.|Conversation |The most polite interaction is Conversation | is the most polite','',morePolite)) %>%
  mutate(score = as.numeric(morePolite) - 1) %>%
  group_by(speaker2) %>%
  summarise(mean = mean(score, na.rm = TRUE),
            sd = sd(score, na.rm = TRUE))


# Share of matches, per male chair (speaker2), in which ChatGPT picked
# conversation 2 as the more polite one, for the annotations currently held
# in `res`.
toplot <- res %>%
  as_tibble() %>%
  # Strip ChatGPT's boilerplate so only the conversation number remains.
  mutate(morePolite = gsub('\\.|Conversation |The most polite interaction is Conversation | is the most polite','',morePolite)) %>%
  count(morePolite, speaker2) %>%
  # Replies that still contain ':' were not reduced to a bare number; drop them.
  filter(!grepl(':', morePolite)) %>%
  # One column per answer ("1", "2"). values_fill makes a chair with no matches
  # for one answer count as 0 instead of producing an NA share.
  pivot_wider(names_from = morePolite, values_from = n, values_fill = 0L) %>%
  mutate(share = `2` / (`1` + `2`),
         type = 'Longer conversations')


# Replace `res` with the second annotation file and append its per-chair
# shares to the plotting table.
load('./output/chatGPT_polite_pairwise_1000.RData')
toplot <- toplot %>%
  bind_rows(
    res %>%
      as_tibble() %>%
      # Strip ChatGPT's boilerplate so only the conversation number remains.
      mutate(morePolite = gsub('\\.|Conversation |The most polite interaction is Conversation | is the most polite','',morePolite)) %>%
      count(morePolite, speaker2) %>%
      # Replies that still contain ':' were not reduced to a bare number.
      filter(!grepl(':', morePolite)) %>%
      # values_fill makes a chair with no matches for one answer count as 0
      # instead of producing an NA share.
      pivot_wider(names_from = morePolite, values_from = n, values_fill = 0L) %>%
      mutate(share = `2` / (`1` + `2`),
             type = 'Shorter conversations')
  )

# Dodged bar chart of the per-chair shares, one bar per conversation type,
# with a dashed reference line at 50%.
# NOTE(review): a later section of this script opens the same PDF path for a
# different figure, so this chart does not survive a full run -- confirm which
# figure is meant to be kept.
pdf('./output/figures/polite_simple_comparison.pdf',width = 7,height = 5)
# Explicit print(): ggplot objects are only drawn by auto-printing at top
# level, so without it the PDF is empty when the script is run via source().
print(
  ggplot(toplot, aes(x = share,y = speaker2,fill = type)) +
    geom_col(position = 'dodge') +
    geom_vline(xintercept = .5,linetype = 'dashed') +
    labs(x = '% of matches where randomly chosen conversation involving Yellen\nis less polite than randomly chosen conversation involving male Fed chair',
         y = 'Male Fed Chairs',
         fill = 'Conversation Type',
         title = 'AI-Annotated Results',
         subtitle = 'Comparing politeness of randomly chosen conversations via Chat-GPT') +
    theme_bw() +
    scale_x_continuous(labels = scales::percent,limits = c(0,1))
)
dev.off()

# ---- One-sample t-tests of the politeness score against 0.5 ----------------
# The same cleaning / testing / tabulating steps were previously copy-pasted
# for every chair and both annotation files; they now live in three helpers.

# Recode ChatGPT's free-text answers in `annotations` (needs a `morePolite`
# column) to 0 (conversation 1 judged more polite) or 1 (conversation 2 judged
# more polite). Answers that cannot be reduced to a number become NA (with the
# usual coercion warning).
polite_scores <- function(annotations) {
  annotations %>%
    as_tibble() %>%
    mutate(morePolite = as.numeric(gsub('\\.|Conversation |The most polite interaction is Conversation | is the most polite','',morePolite)) - 1)
}

# t-test of the recoded score against mu = 0.5, optionally restricted to the
# matches whose second speaker is `chair_name`. t.test() drops NA scores.
polite_ttest <- function(scores, chair_name = NULL) {
  x <- scores$morePolite
  if (!is.null(chair_name)) {
    x <- x[scores$speaker2 %in% chair_name]
  }
  t.test(x, mu = .5)
}

# One-row summary (confidence bounds and point estimate) of a t-test result.
ttest_row <- function(tt, chair, type) {
  tibble(lb = tt$conf.int[1],
         ub = tt$conf.int[2],
         est = unname(tt$estimate),
         chair = chair,
         type = type)
}

# First annotation file (labelled '2,000 characters' in the figure).
load('./output/chatGPT_polite_pairwise_2000.RData')
scores <- polite_scores(res)
overallT <- polite_ttest(scores)
GreenspanT <- polite_ttest(scores, 'Greenspan')
BernankeT <- polite_ttest(scores, 'Bernanke')
PowellT <- polite_ttest(scores, 'Powell')

toplot <- bind_rows(
  ttest_row(GreenspanT, 'Greenspan', '2,000 characters'),
  ttest_row(BernankeT, 'Bernanke', '2,000 characters'),
  ttest_row(PowellT, 'Powell', '2,000 characters')
)

# Second annotation file (labelled '1,000 characters'). The *T objects are
# deliberately overwritten, as before, so after this section they hold the
# results for this file.
load('./output/chatGPT_polite_pairwise_1000.RData')
scores <- polite_scores(res)
overallT <- polite_ttest(scores)
GreenspanT <- polite_ttest(scores, 'Greenspan')
BernankeT <- polite_ttest(scores, 'Bernanke')
PowellT <- polite_ttest(scores, 'Powell')

toplot <- bind_rows(
  toplot,
  ttest_row(GreenspanT, 'Greenspan', '1,000 characters'),
  ttest_row(BernankeT, 'Bernanke', '1,000 characters'),
  ttest_row(PowellT, 'Powell', '1,000 characters')
)

# Point estimates with horizontal confidence intervals per chair, dodged by
# conversation length, with a dashed reference line at 50%.
pdf('./output/figures/polite_simple_comparison.pdf',width = 7,height = 5)
# Explicit print(): ggplot objects are only drawn by auto-printing at top
# level, so without it the PDF is empty when the script is run via source().
print(
  ggplot(toplot, aes(x = est,y = chair,color = type)) +
    geom_point(position = position_dodge(width = .2)) +
    geom_errorbarh(aes(xmin = lb,xmax = ub),height = .1,
                   position = position_dodge(width = .2)) +
    geom_vline(xintercept = .5,linetype = 'dashed') +
    labs(x = '% of matches where randomly chosen conversation involving Yellen\nis less polite than randomly chosen conversation involving male Fed chair',
         y = 'Male Fed Chairs',
         color = 'Conversation Length',
         title = 'AI-Annotated Results',
         subtitle = 'Comparing politeness of randomly chosen conversations via Chat-GPT') +
    theme_bw() +
    scale_x_continuous(labels = scales::percent,limits = c(0,1)) +
    theme(legend.position = 'bottom')
)
dev.off()

# Can we do this pairwise?
# Build `toSample`: one row per "conversation" (a run of consecutive short
# utterances within a document), with the concatenated text, its length and
# the Fed chair assigned from the date.
# `utterance_level` is expected to be provided by this .RData file.
load('./data/finalData.RData')
toSample <- utterance_level %>%
  # filter(docID == 'fed2001-02-13.txt') %>%
  arrange(docID,ind) %>%
  select(docID,date,speaker,opensecretsID,ind,nchars,textclean) %>%
  # filter(nchars < 1000) %>%
  group_by(docID) %>%
  # Per document: index of the first utterance whose opensecretsID contains
  # "FED"; everything before it is dropped.
  mutate(firstFED = ifelse(grepl("FED",opensecretsID),ind,NA)) %>%
  # If a document has no "FED" row, min() returns Inf (with a warning) and the
  # filter below removes the whole document.
  mutate(firstFED = min(firstFED,na.rm=T)) %>%
  # select(docID,ind,firstFED,speaker) %>%
  filter(ind >= firstFED) %>%
  arrange(docID,ind) %>%
  select(docID,date,ind,speaker,textclean,opensecretsID) %>%
  # Keep only rows whose `ind` is exactly one more than the previous row's in
  # the same document; the first row of each document (NA delta) is dropped.
  mutate(delta = ind - lag(ind)) %>%
  filter(delta == 1) %>%
  mutate(delta2 = ind - lag(ind)) %>%
  mutate(delta2 = ifelse(is.na(delta2),1,delta2)) %>%
  # mutate(chunkIndicator = cumsum(delta2 != 1)) %>%
  # Counter that increments whenever the speaker differs from the speaker two
  # rows earlier. NOTE: this chunkIndicator and the `n` derived from it are
  # overwritten further down before they are used.
  mutate(chunkIndicator = cumsum(speaker != lag(speaker,2,default = speaker[1]))) %>%
  group_by(docID,chunkIndicator) %>%
  mutate(n = n()) %>%
  ungroup() %>%
  # Drop long utterances (1,000 characters or more) ...
  mutate(nchars = nchar(textclean)) %>%
  filter(nchars < 1000) %>%
  # ... and start a new chunk wherever the remaining `ind` values are no
  # longer consecutive. The data are ungrouped here, so lag() runs across
  # document boundaries; the group_by(docID, ...) below still keeps documents
  # apart.
  mutate(delta2 = ind - lag(ind)) %>%
  mutate(delta2 = ifelse(is.na(delta2),1,delta2)) %>%
  mutate(chunkIndicator = cumsum(delta2 != 1)) %>%
  group_by(docID,chunkIndicator) %>%
  mutate(n = n()) %>%
  # arrange(desc(n))
  # Keep only chunks with more than two utterances.
  filter(n > 2) %>%
  # Prefix each utterance with its speaker (trailing period removed).
  mutate(textclean = paste0(gsub('\\.$','',speaker),': ',textclean)) %>%
  group_by(docID,date,chunkIndicator,n) %>%
  # Collapse each chunk into a single newline-separated transcript.
  summarise(text = paste(textclean,collapse = '\n')) %>%
  ungroup() %>%
  # Assign the chair from the date using 1 January cut-offs.
  # NOTE(review): chair terms did not start exactly on 1 January of these
  # years, so rows dated shortly after a cut-off may be attributed to the
  # incoming chair -- confirm this is intended.
  mutate(fed = ifelse(date < as.Date('2006-01-01'),'Greenspan',
                      ifelse(date < as.Date('2014-01-01'),'Bernanke',
                             ifelse(date < as.Date('2018-01-01'),'Yellen','Powell')))) %>%
  # Length of the concatenated chunk, used for the size filter when sampling.
  mutate(nchars = nchar(text))

# # slice(c(1:3,600:603)) %>%
# select(-docID,-date,-chunkIndicator) %>%
# group_by(fed) %>%
# mutate(id = row_number()) %>%
# spread(fed,text)# %>%

# Build the pairwise prompts: each draw samples one conversation per chair and
# pairs the Yellen conversation (always "Conversation 1") with each male
# chair's conversation ("Conversation 2").
# NOTE(review): no seed is set, so the sampled pairs differ between runs.

# Loop-invariant: conversations short enough to be sampled.
eligible <- toSample %>%
  arrange(nchars) %>%
  filter(nchars < 2000)

# Alphabetical, i.e. the order in which the grouped sample returned them when
# they were addressed by row position.
male_chairs <- c('Bernanke', 'Greenspan', 'Powell')

# The pairing below looks chairs up by name; fail early if one has no
# eligible conversation instead of silently building a malformed prompt.
stopifnot(all(c('Yellen', male_chairs) %in% eligible$fed))

chunks <- list()
counter <- 1
while (counter < 2000) {
  # One random conversation per chair.
  tmp <- eligible %>%
    group_by(fed) %>%
    sample_n(size = 1) %>%
    ungroup()

  yellen_text <- tmp$text[tmp$fed == 'Yellen']

  for (chair in male_chairs) {
    chunks[[counter]] <- list(
      # '\\n' is a literal backslash followed by n, not a newline; presumably
      # it is unescaped when the prompt is sent -- confirm against
      # create_prompt().
      chunk = paste0('Conversation ', 1:2, ':\\n',
                     c(yellen_text, tmp$text[tmp$fed == chair]),
                     collapse = '\\n\\n'),
      ids = c('Yellen', chair)
    )
    counter <- counter + 1
  }
}

# Send every prompt to the API and collect one answer per match in `res`.
# create_prompt() and submit_openai() are not defined in this part of the
# script and must already be available.
res <- NULL
# NOTE(review): the first six chunks are skipped, as before; this looks like a
# leftover from resuming an interrupted run -- confirm. Negative indexing keeps
# the loop empty (instead of counting backwards) when there are fewer than
# seven chunks.
for (i in seq_along(chunks)[-(1:6)]) {
  prompts <- create_prompt(chunks[[i]]$chunk)

  # Pause between requests.
  Sys.sleep(5)
  openai_completions <- try(submit_openai(prompt = prompts,temperature = 0,n = 1))
  # Retry until the call succeeds. inherits() is used because comparing
  # class() with == yields a vector for multi-class objects, which is an error
  # in a while() condition on current R versions.
  while (inherits(openai_completions, 'try-error')) {
    Sys.sleep(10)
    openai_completions <- try(submit_openai(prompt = prompts,temperature = 0,n = 1))
  }

  # Appended row by row so partial results remain in `res` if the loop is
  # interrupted.
  res <- res %>%
    bind_rows(data.frame(speaker1 = chunks[[i]]$ids[1],
                         speaker2 = chunks[[i]]$ids[2],
                         morePolite = openai_completions$choices$message.content))
}

# Quick look at the collected answers: number of matches per cleaned answer
# and male chair.
as_tibble(res) %>%
  mutate(
    morePolite = gsub(
      pattern = '\\.|Conversation |The most polite interaction is Conversation | is the most polite',
      replacement = '',
      x = morePolite
    )
  ) %>%
  count(morePolite, speaker2)

# Persist the raw (uncleaned) annotations.
save(res, file = './output/chatGPT_polite_pairwise_2000.RData')

# Diverging bar chart for the 1,000-character annotations: counts of matches
# answered "1" are plotted to the left of zero, all other answers to the right.
load('./output/chatGPT_polite_pairwise_1000.RData')

as_tibble(res) %>%
  mutate(
    morePolite = gsub(
      pattern = '\\.|Conversation |The most polite interaction is Conversation | is the most polite',
      replacement = '',
      x = morePolite
    )
  ) %>%
  count(morePolite, speaker2) %>%
  # Flip the sign of the counts for answer "1".
  mutate(n = ifelse(morePolite == '1', -1 * n, n)) %>%
  ggplot(mapping = aes(x = n, y = speaker2, fill = n > 0)) +
  geom_col() +
  geom_vline(xintercept = 0)

require(BradleyTerry2)
